
/*
Construct Zip to County SCI based on Zip-to-zip SCI
*/

* Build the ZCTA-to-county crosswalk from the Census relationship file.
* data:        https://www2.census.gov/geo/docs/maps-data/data/rel/zcta_county_rel_10.txt
* description: https://www2.census.gov/geo/pdfs/maps-data/data/rel/explanation_zcta_county_rel_10.pdf
import delimited using "${data_raw_geo}/zcta_county_rel_10.txt", clear

* only the ZCTA id, the county id and the population count are needed
keep zcta5 geoid poppt

* use the names the SCI files are joined on later
rename zcta5 fr_zcta
rename geoid county

* Covid case counts are only published for NYC as a whole, so the other NYC
* counties are folded into Manhattan (36061)
replace county = 36061 if inlist(county, 36005, 36047, 36081, 36085)

* one row per (ZCTA, county) pair; ZCTAs that spanned several of the recoded
* counties now have duplicate pairs, whose populations are added up here
collapse (rawsum) poppt, by(fr_zcta county)

tempfile cw
save `cw'

* Load every shard of the zip-zip SCI data and collapse it to zip-county level.
* The code below requires each shard to contain user_zcta, fr_zcta and
* scaled_sci after import (the rename is left disabled, as before).
forval i = 0/9 {
	import delimited "${data_raw_sci}/zcta_zcta_data_`i'.tsv", clear
	
	*ren (user_loc fr_loc) (user_zcta fr_zcta)
	
	* join zip to county crosswalk 
	* we have multiple obs per friend zcta in master file (because multiple user 
	* zctas can have the same friend zcta) and in using file (some zctas are assigned
	* to multiple counties), hence joinby (all pairwise combinations) and not merge
	joinby fr_zcta using `cw', unmatched(both)
	
	* Show how well the shard matches the crosswalk, then keep matched pairs only.
	* Crosswalk-only rows (_merge == 2) have no user_zcta and no scaled_sci but a
	* valid poppt; if kept they pass through collapse as a group with missing
	* user_zcta in every shard and end up as junk rows in the final file.
	* SCI-only rows (_merge == 1) have missing poppt and were already ignored by
	* the weighted collapse, so dropping them does not change any result.
	tabulate _merge
	keep if _merge == 3
	drop _merge
	
	* multiply scaled sci with population of zcta that falls in a given county
	gen scaled_sci_pop = scaled_sci * poppt
	
	* collapse to zip-county level:
	*   scaled_sci     = population-weighted mean SCI over the friend zctas of the county
	*   scaled_sci_pop = plain (unweighted) sum of scaled_sci * poppt
	* aweight is what Stata assumes for [w=]; it is spelled out to make that explicit
	collapse (mean) scaled_sci (rawsum) scaled_sci_pop ///
		[aw=poppt], by(user_zcta county)
	
	tempfile part_`i'
	save `part_`i''
}

* Stack the ten collapsed shards (part_0 ... part_9) into one dataset and
* write the zip-to-county SCI file.
use "`part_0'", clear
foreach k of numlist 1/9 {
	append using "`part_`k''"
}

save "${data_derived_sci}/zcta_to_county_SCI.dta", replace
	
